# readxl: reads .xlsx workbooks into tibbles
library(readxl)
# Consolidated social-media usage data (21 students x 13 columns,
# per the str() output below)
sm <- read_excel("/Users/user/Downloads/social_media_cleaned.xlsx")
# Inspect column names, types, and a preview of values
str(sm)
## tibble [21 × 13] (S3: tbl_df/tbl/data.frame)
##  $ character             : chr [1:21] "masinl" "peace" "Patty" "Bunny" ...
##  $ Instagram             : num [1:21] 3.5 7.73 3.77 5.38 12 ...
##  $ LinkedIn              : num [1:21] 4 5.2 7 5.317 0.583 ...
##  $ SnapChat              : num [1:21] 1 3.683 0.533 1.3 0 ...
##  $ Twitter               : num [1:21] 5 0 0 0 0.667 ...
##  $ Whatsapp              : num [1:21] 1 4.18 9.83 5.3 3 ...
##  $ youtube               : num [1:21] 2.5 4.25 1.85 2 3.5 7 3 2 4 3 ...
##  $ OTT                   : num [1:21] 14.5 0 2 2 2 3 0 3 3 0 ...
##  $ Reddit                : num [1:21] 2.5 0 0 0 1 0 0 0 0 0 ...
##  $ Trouble_falling_asleep: num [1:21] 0 1 0 0 1 0 0 1 0 0 ...
##  $ productivity          : num [1:21] 1 1 1 1 1 1 1 1 1 0 ...
##  $ Tired_morning         : num [1:21] 0 0 0 0 1 0 1 1 0 0 ...
##  $ weekenergy            : num [1:21] 3 3 4 4 3 5 4 3 3 2 ...

1. Data Collection Process

#The data was generated based on the usage of social_media apps (in hours) of our MVA class students and average of three weeks data was consolidated and taken for the analysis.

#Dependent Variables:Trouble_falling_asleep,productivity,Tired_morning,weekenergy

#Independent Variables:Instagram,LinkedIn,SnapChat,Twitter,Whatsapp,youtube,OTT,Reddit

#About the dataset * Character ID: Unique ID of each student’s data entry. * Instagram: Instagram app usage duration measured in hours. * LinkedIn: LinkedIn app usage duration measured in hours. * Snapchat: Snapchat app usage duration measured in hours. * Twitter Usage: Twitter app usage duration measured in hours. * Whatsapp Usage: Whatsapp app usage duration measured in hours. * Youtube Usage: Youtube app usage duration measured in hours. * OTT Usage: Over-the-Top (OTT) media services usage duration in hours. * Reddit Usage: Reddit app usage duration measured in hours. * Trouble Falling Asleep: Indicates whether the student reported having trouble falling asleep (0: No, 1: Yes). * Productivity: student’s mood and productivity level (0: Bad, 1: Good). * Tiredness upon Waking Up in the Morning: Indicates the level of tiredness the student reported upon waking up in the morning (0: Low, 1: High). * Weekenergy: Indicates the level of energy the student felt over the entire week, measured on a scale of 5 (where 5: High and 1: Low).

Goal of my analysis

To understand the impact of various social media apps usage on the lifestyle pattern of the students.

2.Exploratory Data Analysis and Visualizations

# Keep the 12 numeric columns by dropping the character ID column
# (equivalent to selecting columns 2:13)
sm1 <- sm[, -1]
sm1
## # A tibble: 21 × 12
##    Instagram LinkedIn SnapChat Twitter Whatsapp youtube   OTT Reddit
##        <dbl>    <dbl>    <dbl>   <dbl>    <dbl>   <dbl> <dbl>  <dbl>
##  1     3.5      4        1       5         1       2.5   14.5    2.5
##  2     7.73     5.2      3.68    0         4.18    4.25   0      0  
##  3     3.77     7        0.533   0         9.83    1.85   2      0  
##  4     5.38     5.32     1.3     0         5.3     2      2      0  
##  5    12        0.583    0       0.667     3       3.5    2      1  
##  6     2.33     7        0.467   0        12       7      3      0  
##  7     5.37     4        0       0         6       3      0      0  
##  8     7        4        3       0        10       2      3      0  
##  9     8.65    10        3.83    0         6.15    4      3      0  
## 10     0.167    0        0       0         1       3      0      0  
## # ℹ 11 more rows
## # ℹ 4 more variables: Trouble_falling_asleep <dbl>, productivity <dbl>,
## #   Tired_morning <dbl>, weekenergy <dbl>
# Five-number summary plus mean for every numeric column
summary(sm1)
##    Instagram          LinkedIn         SnapChat          Twitter      
##  Min.   : 0.1667   Min.   : 0.000   Min.   : 0.0000   Min.   :0.0000  
##  1st Qu.: 3.7667   1st Qu.: 1.917   1st Qu.: 0.0000   1st Qu.:0.0000  
##  Median : 5.3833   Median : 3.917   Median : 0.5333   Median :0.0000  
##  Mean   : 5.9230   Mean   : 3.624   Mean   : 1.9738   Mean   :0.5802  
##  3rd Qu.: 7.0000   3rd Qu.: 5.000   3rd Qu.: 1.8667   3rd Qu.:0.2667  
##  Max.   :15.0167   Max.   :10.000   Max.   :14.8667   Max.   :5.0000  
##     Whatsapp         youtube           OTT             Reddit      
##  Min.   : 1.000   Min.   :0.000   Min.   : 0.000   Min.   :0.0000  
##  1st Qu.: 3.667   1st Qu.:2.000   1st Qu.: 0.000   1st Qu.:0.0000  
##  Median : 6.000   Median :3.000   Median : 1.683   Median :0.0000  
##  Mean   : 6.430   Mean   :2.973   Mean   : 2.361   Mean   :0.5243  
##  3rd Qu.: 8.917   3rd Qu.:4.000   3rd Qu.: 2.467   3rd Qu.:0.0000  
##  Max.   :15.350   Max.   :7.000   Max.   :14.500   Max.   :7.0000  
##  Trouble_falling_asleep  productivity    Tired_morning      weekenergy   
##  Min.   :0.0000         Min.   :0.0000   Min.   :0.0000   Min.   :2.000  
##  1st Qu.:0.0000         1st Qu.:1.0000   1st Qu.:0.0000   1st Qu.:3.000  
##  Median :0.0000         Median :1.0000   Median :0.0000   Median :3.000  
##  Mean   :0.3333         Mean   :0.9524   Mean   :0.3333   Mean   :3.381  
##  3rd Qu.:1.0000         3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:4.000  
##  Max.   :1.0000         Max.   :1.0000   Max.   :1.0000   Max.   :5.000

#The Summary Statistics helps us to understand the spread of the data.

#Calculating mean and covariance

# Column-wise means of all 12 numeric variables
# (reused below as the center for the Mahalanobis distances)
colmean <- colMeans(sm1)
colmean
##              Instagram               LinkedIn               SnapChat 
##              5.9230159              3.6239683              1.9738095 
##                Twitter               Whatsapp                youtube 
##              0.5801587              6.4295238              2.9725397 
##                    OTT                 Reddit Trouble_falling_asleep 
##              2.3607937              0.5242857              0.3333333 
##           productivity          Tired_morning             weekenergy 
##              0.9523810              0.3333333              3.3809524
# 12 x 12 sample covariance matrix of the numeric variables
covariance <- cov(sm1)
covariance
##                         Instagram    LinkedIn    SnapChat     Twitter
## Instagram              12.3713466  0.84502910  3.53067460 -0.87639550
## LinkedIn                0.8450291  6.12742513  0.21894246 -0.41554511
## SnapChat                3.5306746  0.21894246 12.00701587 -0.80480952
## Twitter                -0.8763955 -0.41554511 -0.80480952  1.66576720
## Whatsapp                5.3507421  2.28152698  1.13048413 -2.57754048
## youtube                 2.0167692  1.94490608 -0.96373016 -0.42199431
## OTT                     3.3204392  1.62114392  1.60984127  2.53849431
## Reddit                 -0.4179702 -0.02756786 -0.44789881  0.33895595
## Trouble_falling_asleep  0.8586111 -0.14905556  0.79416667 -0.04888889
## productivity            0.2878175  0.18119841  0.09869048  0.02900794
## Tired_morning           0.1169444 -0.39155556  0.31666667 -0.16972222
## weekenergy             -0.2242063  0.74891270  0.15464286 -0.11123016
##                           Whatsapp      youtube           OTT      Reddit
## Instagram               5.35074206  2.016769180  3.3204391534 -0.41797024
## LinkedIn                2.28152698  1.944906085  1.6211439153 -0.02756786
## SnapChat                1.13048413 -0.963730159  1.6098412698 -0.44789881
## Twitter                -2.57754048 -0.421994312  2.5384943122  0.33895595
## Whatsapp               16.22280476  2.601174603  1.9001670635 -0.86244286
## youtube                 2.60117460  3.018996561  0.9850062169  0.07305357
## OTT                     1.90016706  0.985006217 12.4655743386  1.30897143
## Reddit                 -0.86244286  0.073053571  1.3089714286  2.53638571
## Trouble_falling_asleep -0.12266667  0.032444444 -0.0007777778 -0.10800000
## productivity            0.27147619 -0.001373016  0.1180396825  0.02621429
## Tired_morning          -0.08266667 -0.298388889 -0.3774444444 -0.10800000
## weekenergy              0.38585714  0.280484127 -0.3188174603 -0.15921429
##                        Trouble_falling_asleep productivity Tired_morning
## Instagram                        0.8586111111  0.287817460    0.11694444
## LinkedIn                        -0.1490555556  0.181198413   -0.39155556
## SnapChat                         0.7941666667  0.098690476    0.31666667
## Twitter                         -0.0488888889  0.029007937   -0.16972222
## Whatsapp                        -0.1226666667  0.271476190   -0.08266667
## youtube                          0.0324444444 -0.001373016   -0.29838889
## OTT                             -0.0007777778  0.118039683   -0.37744444
## Reddit                          -0.1080000000  0.026214286   -0.10800000
## Trouble_falling_asleep           0.2333333333  0.016666667    0.08333333
## productivity                     0.0166666667  0.047619048    0.01666667
## Tired_morning                    0.0833333333  0.016666667    0.23333333
## weekenergy                       0.0666666667  0.069047619    0.06666667
##                         weekenergy
## Instagram              -0.22420635
## LinkedIn                0.74891270
## SnapChat                0.15464286
## Twitter                -0.11123016
## Whatsapp                0.38585714
## youtube                 0.28048413
## OTT                    -0.31881746
## Reddit                 -0.15921429
## Trouble_falling_asleep  0.06666667
## productivity            0.06904762
## Tired_morning           0.06666667
## weekenergy              0.54761905
#Compute correlation matrix
# Reuse the numeric subset sm1 (identical to sm[, 2:13]) instead of
# re-subsetting the raw tibble, so every analysis works off one object
cor_matrix <- cor(sm1)
#Create a heatmap of the correlation matrix
heatmap(cor_matrix, main = "Heatmap of Correlation Matrix")

#From covariance we can understand how two variables are linearly related. A good positive covariance indicates a strong linear relationship between the variables. I could see weekenergy and LinkedIn app usage having a good linear relationship based on covariance. This affirmation was made strong by the heatmap of correlation matrix where the pale color shows low correlation and the dark color shows high correlation among the variables.

#Mahalanobis distances for each observation
# stats::mahalanobis() returns the same squared distances as the manual
# t(x - mu) %*% solve(S) %*% (x - mu) loop, but inverts the covariance
# matrix once instead of once per row
sm_d <- mahalanobis(sm1, center = colmean, cov = covariance)
sm_d
##  [1] 17.082159 10.431647  8.097699  9.464283 10.004631 12.785065  6.094650
##  [8] 11.009514 13.558265 19.047619 12.678089  9.556442 18.038751  4.702959
## [15] 18.706311  9.931812 10.823361  5.864670  6.454342 13.839615 11.828117

#Mahalanobis distance calculates the distance between each observation and the mean of the distribution. My own observation is at a distance of 13.55 from the mean. The 1st, 10th, 13th and 15th observations are far away from the mean in our dataset. Mahalanobis distance helped me to estimate the possible outliers.

#starplot
# One star glyph per observation; each ray length encodes one variable,
# so students with similar usage patterns get similar shapes
stars(sm1)

#with starplot we can immediately identify the observations with similarities. Here 3,4,6,9,12,14,17,19 users pattern of social media apps usage are a bit similar.

# Scatterplot matrix: pairwise scatterplots of every pair of variables
pairs(sm1)

#Boxplot of the apps
# Same 8 app-usage columns as elsewhere; use the sm1 subset for
# consistency with the PCA/FA code (sm1[1:8] is identical to sm[, 2:9])
boxplot(sm1[1:8])

#Boxplot helps us to understand the central tendency, spread, range and outliers in the dataset. I could see Instagram and Whatsapp are very predominant among all the apps.

Focusing on instagram and whatsapp effect on dependent variables

#Visualization of Instagram and whatsapp impact on entire weeks energy

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.2
# Density of Instagram hours, one overlaid curve per weekly-energy level
ggplot(sm1, aes(x = Instagram, fill = factor(weekenergy))) +
  geom_density(alpha = 0.5) +
  ggtitle("Plot of Instagram usage and weekenergy") +
  xlab("Instagram Usage") +
  ylab("Density") +
  labs(fill = "entire weeks energy")
## Warning: Groups with fewer than two data points have been dropped.
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

# Density of Whatsapp hours, one overlaid curve per weekly-energy level.
# Fixes a typo in the x-axis label ("Whatapp" -> "Whatsapp").
ggplot(sm1, aes(x = Whatsapp, fill = factor(weekenergy))) +
  geom_density(alpha = 0.5) +
  labs(title = "Plot of Whatsapp usage and weekenergy",
       x = "Whatsapp Usage",
       y = "Density",
       fill = "Entire weeks energy")
## Warning: Groups with fewer than two data points have been dropped.
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

#We could see that people who are spending less time on instagram and whatsapp are having more energy the entire week.

#Visualization of Instagram and whatsapp impact on Trouble_falling_asleep

library(ggplot2)
# Density of Instagram hours split by trouble falling asleep (0/1).
# Use the bare column name inside aes(): ggplot2 evaluates aesthetics in
# the data, and sm1$... triggers the "discouraged" warning seen above.
ggplot(sm1, aes(x = Instagram, fill = factor(Trouble_falling_asleep))) +
  geom_density(alpha = 0.5) +
  labs(title = "Plot of Instagram usage and sleep pattern",
       x = "Instagram Usage",
       y = "Density",
       fill = "trouble in falling asleep")
## Warning: Use of `sm1$Trouble_falling_asleep` is discouraged.
## ℹ Use `Trouble_falling_asleep` instead.

library(ggplot2)
# Density of Whatsapp hours split by trouble falling asleep (0/1).
# Bare column name in aes() avoids the discouraged sm1$... usage.
ggplot(sm1, aes(x = Whatsapp, fill = factor(Trouble_falling_asleep))) +
  geom_density(alpha = 0.5) +
  labs(title = "Plot of Whatsapp usage and sleep pattern",
       x = "Whatsapp Usage",
       y = "Density",
       fill = "trouble in falling asleep")
## Warning: Use of `sm1$Trouble_falling_asleep` is discouraged.
## ℹ Use `Trouble_falling_asleep` instead.

#There is a high impact of whatsapp and instagram usage on sleep pattern of the students.

#Visualization of Instagram and whatsapp impact on Tired mornings

library(ggplot2)
# Density of Instagram hours split by morning tiredness (0/1).
# Bare column name in aes() avoids the discouraged sm1$... usage.
ggplot(sm1, aes(x = Instagram, fill = factor(Tired_morning))) +
  geom_density(alpha = 0.5) +
  labs(title = "Plot of Instagram usage and morning tiredness",
       x = "Instagram Usage",
       y = "Density",
       fill = "Tired Morning")
## Warning: Use of `sm1$Tired_morning` is discouraged.
## ℹ Use `Tired_morning` instead.

library(ggplot2)
# Density of Whatsapp hours split by morning tiredness (0/1).
# Bare column name in aes() avoids the discouraged sm1$... usage.
ggplot(sm1, aes(x = Whatsapp, fill = factor(Tired_morning))) +
  geom_density(alpha = 0.5) +
  labs(title = "Plot of Whatsapp usage and morning tiredness",
       x = "Whatsapp Usage",
       y = "Density",
       fill = "Tired Morning")
## Warning: Use of `sm1$Tired_morning` is discouraged.
## ℹ Use `Tired_morning` instead.

#Instagram and whatsapp usage having a direct impact on tired mornings.

##Visualization of Instagram and whatsapp impact on productivity.

library(ggplot2)
# Density of Instagram hours split by productivity (0: Bad, 1: Good).
# Bare column name in aes() avoids the discouraged sm1$... usage.
ggplot(sm1, aes(x = Instagram, fill = factor(productivity))) +
  geom_density(alpha = 0.5) +
  labs(title = "Plot of Instagram usage and productivity",
       x = "Instagram Usage",
       y = "Density",
       fill = "productivity")
## Warning: Use of `sm1$productivity` is discouraged.
## ℹ Use `productivity` instead.
## Warning: Groups with fewer than two data points have been dropped.
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

library(ggplot2)
# Density of Whatsapp hours split by productivity (0: Bad, 1: Good).
# Bare column name in aes() avoids the discouraged sm1$... usage.
ggplot(sm1, aes(x = Whatsapp, fill = factor(productivity))) +
  geom_density(alpha = 0.5) +
  labs(title = "Plot of Whatsapp usage and productivity",
       x = "Whatsapp Usage",
       y = "Density",
       fill = "productivity")
## Warning: Use of `sm1$productivity` is discouraged.
## ℹ Use `productivity` instead.
## Warning: Groups with fewer than two data points have been dropped.
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

#Interestingly, students are productive even with high usage of Instagram and Whatsapp. This tells me that Instagram and Whatsapp were also used by students for good purposes, like learning new trends in technology and the social world.

Application of different MVA models and Insights.

PCA

# PCA on the 8 app-usage columns only; scale=TRUE standardizes each app
# to unit variance so no single heavily-used app dominates the components
sm_pca <- prcomp(sm1[1:8],scale=TRUE) 
sm_pca
## Standard deviations (1, .., p=8):
## [1] 1.4936735 1.3063036 1.1303414 0.9384308 0.9007674 0.7694819 0.6078953
## [8] 0.3621675
## 
## Rotation (n x k) = (8 x 8):
##                   PC1         PC2          PC3         PC4         PC5
## Instagram  0.43537868 -0.15356106  0.377944297  0.03258545 -0.36643468
## LinkedIn   0.35649468 -0.20991651 -0.329259391 -0.19907722  0.70255046
## SnapChat   0.15947567  0.03392372  0.717791182  0.10281712  0.51019404
## Twitter   -0.39046510 -0.53815939  0.040482193 -0.33406772 -0.09484716
## Whatsapp   0.52694693  0.06079220  0.007167203  0.09132133 -0.28308505
## youtube    0.45370070 -0.20727847 -0.406762139 -0.03615301 -0.09453089
## OTT        0.08162766 -0.68423619  0.216480569 -0.12793328 -0.07632718
## Reddit    -0.12361633 -0.35601755 -0.139687974  0.90062217  0.08919336
##                   PC6         PC7         PC8
## Instagram -0.51184307 -0.48893799 -0.08742222
## LinkedIn   0.02630366 -0.42220705 -0.09096859
## SnapChat  -0.04887482  0.39304838 -0.17449187
## Twitter   -0.02585534  0.04614556 -0.65794190
## Whatsapp   0.68926770  0.02263817 -0.39306576
## youtube   -0.39774620  0.64519354 -0.03187491
## OTT        0.31710521  0.06567287  0.59264873
## Reddit    -0.02048963 -0.07045776 -0.11831273

#From the PCA analysis, we can find the association of different apps with each principal components using which we can reduce the dimensionality of the dataset. Higher absolute values (close to 1) suggest a stronger relationship, while values closer to 0 indicate a weaker association

#For example snapchat app has strong association with PC3

# Proportion of variance explained by each principal component
summary(sm_pca)
## Importance of components:
##                           PC1    PC2    PC3    PC4    PC5     PC6     PC7
## Standard deviation     1.4937 1.3063 1.1303 0.9384 0.9008 0.76948 0.60790
## Proportion of Variance 0.2789 0.2133 0.1597 0.1101 0.1014 0.07401 0.04619
## Cumulative Proportion  0.2789 0.4922 0.6519 0.7620 0.8634 0.93741 0.98360
##                           PC8
## Standard deviation     0.3622
## Proportion of Variance 0.0164
## Cumulative Proportion  1.0000
# factoextra: visual helpers (fviz_*) for PCA and clustering output
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Scree plot: percentage of variance explained per component
fviz_eig(sm_pca, addlabels = TRUE)

#Proportion of total variance here tells about the variance explained by the principal components.

#From the screeplot it is evident that to cover variance above 90% we need to consider PC1 to PC6. I could interpret that PCA is not that beneficial for the social media dataset because we are allowed to discard only PC7 and PC8

# Variable correlation circle: arrows pointing in similar directions
# indicate apps with similar usage patterns; color encodes cos2
# (quality of the variable's representation on the first two PCs)
fviz_pca_var(sm_pca,col.var = "cos2",
             gradient.cols = c("#FFCC00", "#CC9933", "#660033", "#330033"),
             repel = TRUE)

#From the above plot we can understand the usage of Twitter and Reddit apps are very similar among all. I could tell students are not using twitter and reddit apps that much.

Factor Analysis

# load library for factor analysis
library(ggplot2)
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

#Parallel analysis

# Compare observed eigenvalues against eigenvalues from random data to
# suggest how many factors/components to retain
fa.parallel(sm1[1:8])

## Parallel analysis suggests that the number of factors =  0  and the number of components =  0
# NOTE(review): parallel analysis suggested 0 factors, yet 2 rotated
# components are extracted here -- confirm this choice is intentional
fit.pc <- principal(sm1[1:8], nfactors=2, rotate="varimax")
fit.pc
## Principal Components Analysis
## Call: principal(r = sm1[1:8], nfactors = 2, rotate = "varimax")
## Standardized loadings (pattern matrix) based upon correlation matrix
##             RC1   RC2    h2   u2 com
## Instagram  0.68  0.01 0.463 0.54 1.0
## LinkedIn   0.59  0.11 0.359 0.64 1.1
## SnapChat   0.22 -0.11 0.059 0.94 1.5
## Twitter   -0.36  0.84 0.834 0.17 1.4
## Whatsapp   0.73 -0.30 0.626 0.37 1.3
## youtube    0.73  0.07 0.533 0.47 1.0
## OTT        0.37  0.82 0.814 0.19 1.4
## Reddit    -0.04  0.50 0.250 0.75 1.0
## 
##                        RC1  RC2
## SS loadings           2.19 1.75
## Proportion Var        0.27 0.22
## Cumulative Var        0.27 0.49
## Proportion Explained  0.56 0.44
## Cumulative Proportion 0.56 1.00
## 
## Mean item complexity =  1.2
## Test of the hypothesis that 2 components are sufficient.
## 
## The root mean square of the residuals (RMSR) is  0.13 
##  with the empirical chi square  21.06  with prob <  0.072 
## 
## Fit based upon off diagonal values = 0.71

#High absolute values (close to 1) indicate a strong relationship between the variable and the factor. #h2 explains how much variance of the variables are explained by the factors. #u2 indicates the amount of variance not explained by the factors #Reddit,OTT, twitter are better explained by RC2 and all other apps like Instagram,LinkedIn,SnapChat,Whatsapp,youtube are well explained by RC1.

# Eigenvalues of the correlation matrix, rounded to 3 decimals
round(fit.pc$values, 3)
## [1] 2.231 1.706 1.278 0.881 0.811 0.592 0.370 0.131
# Rotated component loadings (small loadings are suppressed when printed)
fit.pc$loadings
## 
## Loadings:
##           RC1    RC2   
## Instagram  0.681       
## LinkedIn   0.589  0.111
## SnapChat   0.216 -0.110
## Twitter   -0.359  0.840
## Whatsapp   0.732 -0.300
## youtube    0.727       
## OTT        0.371  0.822
## Reddit            0.498
## 
##                  RC1   RC2
## SS loadings    2.189 1.749
## Proportion Var 0.274 0.219
## Cumulative Var 0.274 0.492
# Communalities
# Proportion of each variable's variance explained by the two components
fit.pc$communality
##  Instagram   LinkedIn   SnapChat    Twitter   Whatsapp    youtube        OTT 
## 0.46314710 0.35873574 0.05870521 0.83436255 0.62581187 0.53256680 0.81378028 
##     Reddit 
## 0.25038016
# Rotated factor scores, Notice the columns ordering: RC1, RC2
# NOTE(review): this re-prints the same fit.pc object shown above
# (duplicate output)
fit.pc
## Principal Components Analysis
## Call: principal(r = sm1[1:8], nfactors = 2, rotate = "varimax")
## Standardized loadings (pattern matrix) based upon correlation matrix
##             RC1   RC2    h2   u2 com
## Instagram  0.68  0.01 0.463 0.54 1.0
## LinkedIn   0.59  0.11 0.359 0.64 1.1
## SnapChat   0.22 -0.11 0.059 0.94 1.5
## Twitter   -0.36  0.84 0.834 0.17 1.4
## Whatsapp   0.73 -0.30 0.626 0.37 1.3
## youtube    0.73  0.07 0.533 0.47 1.0
## OTT        0.37  0.82 0.814 0.19 1.4
## Reddit    -0.04  0.50 0.250 0.75 1.0
## 
##                        RC1  RC2
## SS loadings           2.19 1.75
## Proportion Var        0.27 0.22
## Cumulative Var        0.27 0.49
## Proportion Explained  0.56 0.44
## Cumulative Proportion 0.56 1.00
## 
## Mean item complexity =  1.2
## Test of the hypothesis that 2 components are sufficient.
## 
## The root mean square of the residuals (RMSR) is  0.13 
##  with the empirical chi square  21.06  with prob <  0.072 
## 
## Fit based upon off diagonal values = 0.71
# Component scores: each observation's position on RC1 and RC2
fit.pc$scores
##               RC1         RC2
##  [1,] -0.49593354  3.81935345
##  [2,]  0.37453109 -0.45810128
##  [3,]  0.24035060 -0.39724117
##  [4,] -0.11821501 -0.28545411
##  [5,] -0.05535776  0.18730345
##  [6,]  1.33982873 -0.15394397
##  [7,] -0.16544326 -0.57370308
##  [8,]  0.36239954 -0.37616101
##  [9,]  1.27403174  0.04737682
## [10,] -1.47503111 -0.57670717
## [11,] -1.47090422  0.33142379
## [12,] -0.99564293  0.65382335
## [13,] -0.64587802 -0.84729946
## [14,] -0.54403365 -0.04349169
## [15,] -0.05025441  0.89199339
## [16,]  0.46518890 -0.19297427
## [17,]  0.63875357 -0.56313735
## [18,] -0.42630523 -0.44123138
## [19,]  0.50350291 -0.59787800
## [20,]  2.63240375  0.51258979
## [21,] -1.38799166 -0.93654010
# Scatter of the observations in RC1/RC2 space
fa.plot(fit.pc) # See Correlations within Factors

#Factors that contribute to RC1 and RC2 Visualization
# Diagram linking each variable to the component it loads on most
fa.diagram(fit.pc)

#some visualizations using the factors

#very simple structure visualization
# VSS fits models with increasing numbers of factors and reports fit
# statistics for each. NOTE(review): run on all 12 columns here (including
# the binary outcome variables), not just the 8 app-usage ones -- confirm
vss(sm1)
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully

## 
## Very Simple Structure
## Call: vss(x = sm1)
## Although the VSS complexity 1 shows  8  factors, it is probably more reasonable to think about  5  factors
## VSS complexity 2 achieves a maximimum of 0.78  with  6  factors
## 
## The Velicer MAP achieves a minimum of 0.07  with  1  factors 
## BIC achieves a minimum of  -91.91  with  1  factors
## Sample Size adjusted BIC achieves a minimum of  7.85  with  7  factors
## 
## Statistics by number of factors 
##   vss1 vss2   map dof chisq  prob sqresid  fit RMSEA   BIC SABIC complex eChisq
## 1 0.32 0.00 0.065  54  72.5 0.047   13.31 0.32 0.118 -91.9  74.8     1.0  96.12
## 2 0.42 0.55 0.073  43  57.5 0.068    8.87 0.55 0.117 -73.4  59.4     1.4  54.81
## 3 0.51 0.66 0.083  33  40.8 0.165    5.81 0.70 0.094 -59.7  42.2     1.5  26.88
## 4 0.54 0.72 0.096  24  27.9 0.264    3.87 0.80 0.073 -45.2  28.9     1.6  11.30
## 5 0.55 0.74 0.119  16  19.5 0.242    2.82 0.86 0.090 -29.2  20.2     1.7   5.24
## 6 0.54 0.78 0.162   9  14.5 0.104    2.07 0.89 0.164 -12.9  14.9     1.8   2.95
## 7 0.55 0.72 0.194   3   7.7 0.052    1.81 0.91 0.269  -1.4   7.8     2.0   1.17
## 8 0.59 0.76 0.265  -2   4.9    NA    0.92 0.95    NA    NA    NA     1.9   0.36
##    SRMR eCRMS eBIC
## 1 0.186 0.206  -68
## 2 0.141 0.174  -76
## 3 0.098 0.139  -74
## 4 0.064 0.106  -62
## 5 0.043 0.088  -43
## 6 0.033 0.088  -24
## 7 0.021 0.096   -8
## 8 0.011    NA   NA
# Computing Correlation Matrix
# 12 x 12 correlation matrix (the standardized analogue of `covariance`)
corrm.sm <- cor(sm1)
corrm.sm
##                          Instagram     LinkedIn    SnapChat     Twitter
## Instagram               1.00000000  0.097056399  0.28968877 -0.19305653
## LinkedIn                0.09705640  1.000000000  0.02552545 -0.13006846
## SnapChat                0.28968877  0.025525452  1.00000000 -0.17995686
## Twitter                -0.19305653 -0.130068464 -0.17995686  1.00000000
## Whatsapp                0.37769615  0.228835623  0.08099980 -0.49583293
## youtube                 0.33000187  0.452197669 -0.16006877 -0.18817769
## OTT                     0.26738122  0.185492527  0.13158590  0.55707401
## Reddit                 -0.07461553 -0.006992884 -0.08116237  0.16490296
## Trouble_falling_asleep  0.50535856 -0.124658176  0.47446668 -0.07841779
## productivity            0.37498880  0.335447863  0.13051708  0.10299576
## Tired_morning           0.06883078 -0.327465831  0.18918923 -0.27223450
## weekenergy             -0.08613906  0.408839562  0.06030774 -0.11645982
##                           Whatsapp      youtube           OTT       Reddit
## Instagram               0.37769615  0.330001869  0.2673812155 -0.074615529
## LinkedIn                0.22883562  0.452197669  0.1854925268 -0.006992884
## SnapChat                0.08099980 -0.160068767  0.1315859038 -0.081162369
## Twitter                -0.49583293 -0.188177691  0.5570740080  0.164902964
## Whatsapp                1.00000000  0.371685163  0.1336203696 -0.134449660
## youtube                 0.37168516  1.000000000  0.1605652338  0.026399913
## OTT                     0.13362037  0.160565234  1.0000000000  0.232791099
## Reddit                 -0.13444966  0.026399913  0.2327910994  1.000000000
## Trouble_falling_asleep -0.06304856  0.038656332 -0.0004560485 -0.140387265
## productivity            0.30887192 -0.003621212  0.1532080319  0.075429291
## Tired_morning          -0.04248925 -0.355519106 -0.2213138016 -0.140387265
## weekenergy              0.12945663  0.218141205 -0.1220243994 -0.135093704
##                        Trouble_falling_asleep productivity Tired_morning
## Instagram                        0.5053585585  0.374988801    0.06883078
## LinkedIn                        -0.1246581760  0.335447863   -0.32746583
## SnapChat                         0.4744666806  0.130517080    0.18918923
## Twitter                         -0.0784177942  0.102995756   -0.27223450
## Whatsapp                        -0.0630485641  0.308871917   -0.04248925
## youtube                          0.0386563318 -0.003621212   -0.35551911
## OTT                             -0.0004560485  0.153208032   -0.22131380
## Reddit                          -0.1403872651  0.075429291   -0.14038727
## Trouble_falling_asleep           1.0000000000  0.158113883    0.35714286
## productivity                     0.1581138830  1.000000000    0.15811388
## Tired_morning                    0.3571428571  0.158113883    1.00000000
## weekenergy                       0.1865009616  0.427581673    0.18650096
##                         weekenergy
## Instagram              -0.08613906
## LinkedIn                0.40883956
## SnapChat                0.06030774
## Twitter                -0.11645982
## Whatsapp                0.12945663
## youtube                 0.21814120
## OTT                    -0.12202440
## Reddit                 -0.13509370
## Trouble_falling_asleep  0.18650096
## productivity            0.42758167
## Tired_morning           0.18650096
## weekenergy              1.00000000
# NOTE(review): plot() on a plain matrix does not draw a correlation
# heatmap; heatmap(corrm.sm) or corrplot::corrplot() may be what was
# intended here -- confirm
plot(corrm.sm)

# This PCA is identical to sm_pca computed earlier (same columns, same
# scaling), so reuse that object instead of refitting the model.
social_pca <- sm_pca
summary(social_pca)
## Importance of components:
##                           PC1    PC2    PC3    PC4    PC5     PC6     PC7
## Standard deviation     1.4937 1.3063 1.1303 0.9384 0.9008 0.76948 0.60790
## Proportion of Variance 0.2789 0.2133 0.1597 0.1101 0.1014 0.07401 0.04619
## Cumulative Proportion  0.2789 0.4922 0.6519 0.7620 0.8634 0.93741 0.98360
##                           PC8
## Standard deviation     0.3622
## Proportion of Variance 0.0164
## Cumulative Proportion  1.0000
# Bar plot of the variance of each principal component
plot(social_pca)

#Biplot Visualization

biplot(fit.pc)

#I feel factor analysis is not beneficial for the social media data because I observed that we are missing the most part of the uniqueness of these apps by including factors and we are able to capture only a small portion of variances by using factors. #And parallel analysis screeplot indicated that the ideal number of factors for the social media data is zero. #From the component analysis we got similar results to PCA, where the apps like Instagram, whatsapp/wechat, LinkedIn, Youtube, Snapchat usages are a bit similar and high compared to OTT, Twitter and Reddit.

Cluster Analysis

#Hierarchical Clustering- Dendrogram
# Standardize all 12 columns (mean 0, sd 1) so the distance is not
# dominated by the variables with the largest raw scale
sm_scaled <- scale(sm1)
# Euclidean distance between every pair of students
dist_matrix <- dist(sm_scaled)

#Clustering Single
# Single linkage: distance between clusters = distance between their
# closest members
hc <- hclust(dist_matrix,method = "single")
fviz_dend(hc)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

#Default Clustering

# hclust's default linkage is "complete" (farthest-neighbor)
hc <- hclust(dist_matrix)
plot(hc, hang = -1, cex = 0.6, main = "Dendrogram for Hierarchical Clustering")

#Average Clustering

# Average linkage: distance between clusters = mean pairwise distance
hc <- hclust(dist_matrix,method = "average")
plot(hc, hang = -1, cex = 0.6, main = "Dendrogram for Hierarchical Clustering")

#By observing the above dendrograms, k=2 clusters will be sufficient to group all the students of the class. This is confirmed further by the D index graphical representation.

#Non-Hierarchical Clustering(k-means)
# k = 2 chosen from the dendrograms above (later confirmed by NbClust).
num_clusters <- 2
# NOTE(review): kmeans() uses random starting centers — add set.seed()
# before this call if the cluster assignments must be reproducible.
kmeans_model <- kmeans(sm_scaled, centers = num_clusters)

# Membership for each cluster
table(kmeans_model$cluster)
## 
##  1  2 
##  7 14
# Principal Components
# NOTE(review): sm_scaled is already standardized, so scale=TRUE here is
# effectively a no-op (it re-standardizes columns that are mean 0, sd 1).
pca_result <- prcomp(sm_scaled,scale=TRUE)
pca_result
## Standard deviations (1, .., p=12):
##  [1] 1.6203942 1.4754300 1.3310928 1.1776881 0.9897347 0.9245638 0.8967807
##  [8] 0.7514383 0.6440153 0.4723495 0.4045767 0.1839881
## 
## Rotation (n x k) = (12 x 12):
##                                PC1          PC2         PC3         PC4
## Instagram               0.42816828  0.040374089 -0.27669154  0.36077066
## LinkedIn                0.31458993 -0.375066753  0.14658080 -0.25512070
## SnapChat                0.24530326  0.279812642 -0.28852512  0.08458621
## Twitter                -0.27287007 -0.245837113 -0.49833107 -0.25725953
## Whatsapp                0.40478489 -0.117263785  0.20574594  0.25457749
## youtube                 0.30854324 -0.364491106  0.17997327  0.21460358
## OTT                     0.07977957 -0.348086166 -0.52220162  0.08187832
## Reddit                 -0.11757332 -0.226467324 -0.22416407 -0.02858350
## Trouble_falling_asleep  0.28246439  0.358670833 -0.31587576  0.04233834
## productivity            0.35825558 -0.053909034 -0.20839294 -0.43253653
## Tired_morning           0.08137741  0.517992460 -0.03369194 -0.19614866
## weekenergy              0.29846249  0.006382348  0.16843149 -0.62103181
##                                PC5         PC6         PC7          PC8
## Instagram              -0.04578095  0.05108928  0.28555148 -0.325087718
## LinkedIn                0.19120246 -0.10250708 -0.30832764 -0.195679386
## SnapChat                0.15885641 -0.16531309 -0.72105808  0.051423328
## Twitter                 0.18303488  0.25159733  0.16826747  0.008552916
## Whatsapp               -0.41429227  0.29631693 -0.11055036  0.277673487
## youtube                 0.28970917 -0.24318967  0.35454996  0.237554134
## OTT                    -0.02365440  0.20285346 -0.10917199  0.513644260
## Reddit                 -0.49365028 -0.77780629  0.01903676  0.039910022
## Trouble_falling_asleep  0.36779356 -0.24340561  0.25673461 -0.033294256
## productivity           -0.39794597  0.18254386  0.08750531 -0.416952523
## Tired_morning          -0.27688783  0.01151556  0.21367750  0.428408387
## weekenergy              0.18024711 -0.10234403  0.07562961  0.309613656
##                                PC9        PC10        PC11        PC12
## Instagram              -0.17782279  0.15803981 -0.54472544  0.25115168
## LinkedIn               -0.58222718 -0.27931005  0.12757002  0.24512539
## SnapChat                0.16360056  0.38914908  0.08535895  0.10251068
## Twitter                 0.15950829  0.11258940  0.19450385  0.59114180
## Whatsapp                0.34392387 -0.29798515  0.17846889  0.35715770
## youtube                 0.03795324  0.48373698  0.35681581 -0.07438099
## OTT                    -0.20642806 -0.14358794 -0.15991649 -0.43221675
## Reddit                  0.07648933 -0.08051441 -0.04455224  0.13738996
## Trouble_falling_asleep  0.15983503 -0.56114478  0.28533231 -0.08132826
## productivity            0.11824824  0.19017568  0.29333104 -0.35895531
## Tired_morning          -0.52328057  0.17804741  0.18472916  0.19953066
## weekenergy              0.30926178  0.01575160 -0.50356016  0.05312393
# Visualize cluster and membership using first two Principal Components
fviz_cluster(list(data = pca_result$x[, 1:2], cluster = kmeans_model$cluster))

#This plot visualizes clusters and their memberships using the first two principal components.

# Visualize cluster centers for k-means.
# factoextra deprecated the `frame`/`frame.type` arguments in favour of
# `ellipse`/`ellipse.type`; using the current argument name silences the
# deprecation warnings the old call emitted.
fviz_cluster(kmeans_model, data = sm_scaled, geom = "point", ellipse.type = "convex", 
             pointsize = 2, fill = "white", main = "K-means Cluster Centers")

# Visualize cluster and membership using first two Principal Components for k-means.
# Reuse `pca_result` computed above instead of recomputing the identical PCA.
fviz_cluster(kmeans_model, data = pca_result$x[, 1:2], geom = "point", 
             pointsize = 2, fill = "white", main = "K-means Clustering Result (PCA)")

#This visualization helps to understand how the data points are grouped into clusters based on their similarities, as revealed by the PCA analysis.

library(cluster)
# Calculate silhouette information for k-means clustering:
# per-observation cohesion vs. separation scores in [-1, 1].
sil <- silhouette(kmeans_model$cluster, dist(sm_scaled))

# Visualize the silhouette plot for k-means clustering.
# (Echoed averages below — 0.00 and 0.19 — indicate weak cluster separation.)
fviz_silhouette(sil, main = "Silhouette Plot for K-means Clustering")
##   cluster size ave.sil.width
## 1       1    7          0.00
## 2       2   14          0.19

#A higher silhouette width indicates better separation of clusters, while negative values suggest that points might be assigned to the wrong clusters. This plot helps in determining the optimal number of clusters for k-means clustering and assessing the overall clustering performance.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(NbClust)

#optimal cluster method/visualization
# NbClust evaluates up to 30 cluster-validity indices over k = 2..10 using
# complete linkage and Euclidean distance, then reports the majority vote.
# `sm_scaled` is already standardized, so the original extra `scale()` call
# was a no-op double-scaling and has been dropped; columns 1:8 are the
# app-usage hours.
res.nbclust <- NbClust(sm_scaled[, 1:8], distance = "euclidean",
                       min.nc = 2, max.nc = 10, method = "complete", index = "all")
## Warning in pf(beale, pp, df2): NaNs produced

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 8 proposed 2 as the best number of clusters 
## * 3 proposed 3 as the best number of clusters 
## * 1 proposed 4 as the best number of clusters 
## * 2 proposed 5 as the best number of clusters 
## * 2 proposed 6 as the best number of clusters 
## * 4 proposed 8 as the best number of clusters 
## * 4 proposed 10 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  2 
##  
##  
## *******************************************************************

#The D index suggests the optimal number of clusters, according to the majority rule, is 2. #Through cluster analysis I am able to identify users whose social media usage pattern is similar to mine. Cluster analysis helped to group students based on hidden patterns in their social media usage, on which any further analysis can be built.

Regression

#Multiple Regression

1.Model Development

#Performing multiple regression on the dataset:
# model Trouble_falling_asleep as a linear function of the 8 app-usage hours.
# Use the bare response name in the formula (rather than `sm1$...`) so both
# the response and the predictors are resolved consistently through
# `data = sm1`; mixing `df$y` with `data = df` is fragile and can confuse
# later predict() calls on new data.
fit <- lm(Trouble_falling_asleep ~ Instagram + LinkedIn + SnapChat + Twitter +
            Whatsapp + youtube + OTT + Reddit, data = sm1)
#show the results
summary(fit)
## 
## Call:
## lm(formula = sm1$Trouble_falling_asleep ~ Instagram + LinkedIn + 
##     SnapChat + Twitter + Whatsapp + youtube + OTT + Reddit, data = sm1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.47240 -0.26171 -0.04857  0.14431  0.71319 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  0.02935    0.36730   0.080    0.938  
## Instagram    0.07008    0.03479   2.014    0.067 .
## LinkedIn    -0.02846    0.04682  -0.608    0.555  
## SnapChat     0.05725    0.03387   1.690    0.117  
## Twitter      0.03816    0.14738   0.259    0.800  
## Whatsapp    -0.02922    0.03663  -0.798    0.441  
## youtube      0.04025    0.07350   0.548    0.594  
## OTT         -0.02642    0.05020  -0.526    0.608  
## Reddit      -0.02379    0.06667  -0.357    0.727  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4432 on 12 degrees of freedom
## Multiple R-squared:  0.495,  Adjusted R-squared:  0.1583 
## F-statistic:  1.47 on 8 and 12 DF,  p-value: 0.2639

#From the above summary, the overall F-test p-value is 0.2639, which indicates the model as a whole is not statistically significant at conventional levels. The model explains approximately 49% of the variability in trouble_falling_asleep, as indicated by the multiple R-squared value, although the adjusted R-squared (0.158) is much lower. None of the individual coefficients are statistically significant at the 5% level, indicating weak evidence of association.

# Extract the fitted regression coefficients: intercept plus one slope per app.
coefficients(fit)
## (Intercept)   Instagram    LinkedIn    SnapChat     Twitter    Whatsapp 
##  0.02935003  0.07007642 -0.02846038  0.05724962  0.03816260 -0.02922242 
##     youtube         OTT      Reddit 
##  0.04025224 -0.02641988 -0.02379311

#From the above we get the fitted equation in the form y = b0 + b1x1 + b2x2 + … + bnxn, where the intercept is b0 = 0.029 and the coefficients are b1 = 0.0700, ….

#The positive coefficients for Instagram, Snapchat, Twitter and YouTube suggest a potential positive association with trouble_falling_asleep, while the negative coefficients for LinkedIn, Whatsapp, OTT and Reddit imply a negative association. The intercept represents the estimated trouble-in-sleep score when all predictors are zero.

# Fitted values (predicted sleep-trouble scores) for each of the 21 students.
fitted(fit)
##           1           2           3           4           5           6 
##  0.03767577  0.68297470 -0.14111191  0.20249080  0.85568960 -0.12780750 
##           7           8           9          10          11          12 
##  0.23700749  0.28681294  0.47239558  0.13256374  0.41692826  0.27284963 
##          13          14          15          16          17          18 
##  1.09915634  0.43420708  0.04857065  0.41826973  0.41113911  0.32212351 
##          19          20          21 
##  0.26171236  0.88332910 -0.20697699

Residual Analysis

library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Pairwise scatterplot matrix across all variables in sm1.
ggpairs(data=sm1, title="Social-Media")

# Built-in lm diagnostic plots:
plot(fit, which=1) # Residuals vs Fitted

plot(fit, which=2) # Normal Q-Q plot

#In an ideal normal distribution QQ plot, the points would fall along a straight diagonal line. However, in this plot, the points show some deviation from the diagonal,this suggests the data may not fully conform to a normal distribution and could indicate the presence of outliers or other non-normal characteristics.Identifying departures from normality can inform the choice of appropriate modeling techniques.

# Raw residuals (observed - fitted) for each observation.
# NOTE(review): naming this variable `residuals` shadows the name of the
# stats::residuals() generic (calls like residuals(fit) still dispatch to
# the function, but a name such as `resid_vals` would be clearer).
residuals <- residuals(fit)
residuals
##           1           2           3           4           5           6 
## -0.03767577  0.31702530  0.14111191 -0.20249080  0.14431040  0.12780750 
##           7           8           9          10          11          12 
## -0.23700749  0.71318706 -0.47239558 -0.13256374  0.58307174 -0.27284963 
##          13          14          15          16          17          18 
## -0.09915634 -0.43420708 -0.04857065  0.58173027 -0.41113911 -0.32212351 
##          19          20          21 
## -0.26171236  0.11667090  0.20697699
#Plot residuals against fitted values to check for homoscedasticity
plot_resid_fitted <-
  ggplot(mapping = aes(x = fitted(fit), y = residuals)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(
    x = "Fitted Values",
    y = "Residuals",
    title = "Residuals vs Fitted Values Plot"
  ) +
  theme_minimal()
print(plot_resid_fitted)

#The residual vs. fitted plot is a tool used to evaluate the assumptions and adequacy of a regression model. It helps to identify whether the model adequately captures the underlying relationships in the data or if there are issues that need to be addressed. #Because this plot shows a systematic pattern in the residuals around zero rather than random scatter, the model is likely not fully appropriate for this data.

Prediction

# Predict sleep trouble for one hypothetical student (hours per app).
# (Calling the generic predict() would be more idiomatic than predict.lm().)
predict.lm(fit, data.frame(Instagram=8, LinkedIn=5, SnapChat=4, Twitter=4,
    Whatsapp=4, youtube=8, OTT=3, Reddit=4 ))
##         1 
## 0.8600045

#Here the model predicted the trouble falling asleep value for the given values.

Model Accuracy

#Make predictions using the model (on the training data itself, so this is
# an in-sample, optimistic accuracy estimate).
predicted <- predict(fit, newdata = sm1)
#Calculating RMSE by taking the square root of the mean of the squared differences between the actual values and the predicted values (predicted)
rmse <- sqrt(mean((sm1$Trouble_falling_asleep - predicted)^2))
rmse
## [1] 0.3350081

#An RMSE of 0.335 indicates that, on average, the model's predictions deviate from the observed values by approximately 0.335 units; since the response is binary (0/1), this is a moderate error. A lower RMSE value indicates better performance of the model.

Some Visualizations

library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:psych':
## 
##     logit
#Nonlinearity
# component + residual plot (partial-residual plots, one per predictor)
crPlots(fit)

# plot studentized residuals vs. fitted values.
# (Dropped a duplicate `library(car)` call here — car is already attached
# earlier in this section, so reloading it was redundant.)
spreadLevelPlot(fit)
## Warning in spreadLevelPlot.lm(fit): 
## 3 negative fitted values removed

## 
## Suggested power transformation:  0.7663289

#The plot reveals patterns in the spread of residuals across the range of fitted values. If residuals are evenly spread, it suggests homoscedasticity. The upward trend of the curve suggests increasing variability of residuals as fitted values rise, indicating potential heteroscedasticity.

#Logistic Regression

#Logistic regression is a statistical method used for binary classification problems. It predicts the probability that a given observation belongs to one of two classes. #It gives the relationship between a binary dependent variable and independent variables.

#Load required Libraries
library(dplyr)
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.3.2
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
# Logistic regression of the binary sleep-trouble indicator on app usage.
# Use the bare response name so it is resolved through `data = sm1`,
# consistent with the predictors (mixing `sm1$y` with `data = sm1` is fragile).
lr <- glm(Trouble_falling_asleep ~ Instagram + LinkedIn + SnapChat + Twitter +
            Whatsapp + youtube + OTT + Reddit, data = sm1, family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# The warning above signals (quasi-)complete separation: with 21 rows and
# 8 predictors some fitted probabilities hit 0/1, so coefficient estimates
# are unstable and their standard errors inflated.
summary(lr)
## 
## Call:
## glm(formula = sm1$Trouble_falling_asleep ~ Instagram + LinkedIn + 
##     SnapChat + Twitter + Whatsapp + youtube + OTT + Reddit, family = "binomial", 
##     data = sm1)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)  -4.6978     4.7433  -0.990    0.322
## Instagram     0.7655     0.5993   1.277    0.202
## LinkedIn     -1.0183     1.1699  -0.870    0.384
## SnapChat      2.7338     3.1258   0.875    0.382
## Twitter       0.8471     1.4520   0.583    0.560
## Whatsapp     -0.3141     0.3923  -0.801    0.423
## youtube       1.0164     1.5883   0.640    0.522
## OTT          -1.5516     2.2342  -0.694    0.487
## Reddit        0.0207     0.6622   0.031    0.975
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 26.734  on 20  degrees of freedom
## Residual deviance: 11.500  on 12  degrees of freedom
## AIC: 29.5
## 
## Number of Fisher Scoring iterations: 9

#The logistic regression suggests that there is no significant predictor of trouble falling asleep. The intercept indicates a baseline of approximately -4.69 when all independent variables are zero.The model’s fit is modest, with slightly lower residual deviance compared to null deviance, and an AIC of 29.5. #From the above it is evident that the model is not acceptable since the independent variables are not contributing significantly to the variation in the dependent variable.

# Deviance residuals of the logistic model (the default type for glm).
residuals(lr)
##             1             2             3             4             5 
## -8.920605e-05  1.569379e-02 -3.882687e-03 -1.063736e-01  1.913654e-01 
##             6             7             8             9            10 
## -9.188637e-03 -2.441611e-01  1.608947e+00 -1.052751e+00 -5.442003e-01 
##            11            12            13            14            15 
##  9.633205e-01 -4.333049e-01  2.107342e-08 -1.323641e+00 -3.486828e-01 
##            16            17            18            19            20 
##  1.765154e+00 -7.920632e-01 -6.237626e-01 -5.244625e-01  3.022679e-02 
##            21 
## -3.160719e-02
# Residuals-vs-fitted diagnostic; note such plots show bands by construction
# for binary outcomes, so patterns here are expected and less informative.
plot(lr, which = 1)

#There is a fixed pattern in the residuals vs fitted plot which means that the selected independent variables will not explain the dependent variable well.

# Sequential (Type I) analysis of deviance: terms added first-to-last.
# NOTE(review): anova(lr, test = "Chisq") would add per-term p-values.
anova(lr)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: sm1$Trouble_falling_asleep
## 
## Terms added sequentially (first to last)
## 
## 
##           Df Deviance Resid. Df Resid. Dev
## NULL                         20     26.734
## Instagram  1   5.9366        19     20.797
## LinkedIn   1   0.5658        18     20.231
## SnapChat   1   3.6014        17     16.630
## Twitter    1   0.3049        16     16.325
## Whatsapp   1   2.9495        15     13.375
## youtube    1   0.3787        14     12.997
## OTT        1   1.4962        13     11.501
## Reddit     1   0.0009        12     11.500
#"Pseudo R-squared" and its p-value
ll.null <- lr$null.deviance/-2
ll.proposed <- lr$deviance/-2
(ll.null - ll.proposed) / ll.null
## [1] 0.5698455

#The pseudo R-squared value resulting from the provided code is about 0.57, meaning the predictors account for roughly 57% of the null deviance. This is a substantial improvement over the intercept-only model, though much of the variability in the response remains unexplained.

# Build a table of fitted probabilities next to the observed outcome,
# sorted ascending by probability and ranked for plotting.
predicted.data <- data.frame(
  probability.of.hd = lr$fitted.values,
  Trouble_falling_asleep = sm1$Trouble_falling_asleep
)
predicted.data <- predicted.data[order(predicted.data$probability.of.hd), ]
predicted.data$rank <- seq_len(nrow(predicted.data))

# Rank plot: predicted probability per student, coloured by observed outcome.
ggplot(data=predicted.data, aes(x=rank, y=probability.of.hd)) +
  geom_point(aes(color=Trouble_falling_asleep), alpha=1, shape=4, stroke=2) +
  xlab("Index") +
  ylab("Predicted probability of getting in sleeping")

#The plot is a graphical representation of a predictive model, depicting the probability of trouble falling asleep against an index.The dotted line likely represents a fitted curve, showing how the probability changes across different index values, with a steep incline around the threshold(15). This visualization could be used to understand factors influencing trouble in sleep.

library(caret)
## Loading required package: lattice
# Predicted probabilities on the training data (type="response" gives
# probabilities rather than log-odds).
pdata <- predict(lr,newdata=sm1,type="response" )
pdata
##            1            2            3            4            5            6 
## 3.978860e-09 9.998769e-01 7.537601e-06 5.641701e-03 9.818563e-01 4.221463e-05 
##            7            8            9           10           11           12 
## 2.936746e-02 2.740746e-01 4.254351e-01 1.376353e-01 6.287679e-01 8.960485e-02 
##           13           14           15           16           17           18 
## 1.000000e+00 5.835601e-01 5.897903e-02 2.105810e-01 2.692487e-01 1.767867e-01 
##           19           20           21 
## 1.284922e-01 9.995433e-01 4.993826e-04
# Threshold the fitted probabilities at 0.5 and label the classes "0"/"1".
pdataF <- as.factor(ifelse(pdata > 0.5, "1", "0"))
# Ensure predictions and reference share identical factor levels before
# building the confusion matrix.
sm1$Trouble_falling_asleep <- factor(sm1$Trouble_falling_asleep, levels = c("0", "1"))
levels(pdataF) <- levels(sm1$Trouble_falling_asleep)
confusionMatrix(pdataF, sm1$Trouble_falling_asleep)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 13  2
##          1  1  5
##                                           
##                Accuracy : 0.8571          
##                  95% CI : (0.6366, 0.9695)
##     No Information Rate : 0.6667          
##     P-Value [Acc > NIR] : 0.04616         
##                                           
##                   Kappa : 0.6667          
##                                           
##  Mcnemar's Test P-Value : 1.00000         
##                                           
##             Sensitivity : 0.9286          
##             Specificity : 0.7143          
##          Pos Pred Value : 0.8667          
##          Neg Pred Value : 0.8333          
##              Prevalence : 0.6667          
##          Detection Rate : 0.6190          
##    Detection Prevalence : 0.7143          
##       Balanced Accuracy : 0.8214          
##                                           
##        'Positive' Class : 0               
## 
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
# ROC curve of the logistic model on its own training data
# (echoed AUC ≈ 0.949 — an in-sample, optimistic estimate).
roc(sm1$Trouble_falling_asleep, lr$fitted.values, plot=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

## 
## Call:
## roc.default(response = sm1$Trouble_falling_asleep, predictor = lr$fitted.values,     plot = TRUE)
## 
## Data: lr$fitted.values in 14 controls (sm1$Trouble_falling_asleep 0) < 7 cases (sm1$Trouble_falling_asleep 1).
## Area under the curve: 0.949

#Discriminant Analysis

# Splitting the dataset into 75% training and 25% test sets
# NOTE(review): sample() is not seeded — add set.seed() before this line if
# the split (and all downstream LDA results) must be reproducible.
smp_size_raw <- floor(0.75 * nrow(sm1))
train_ind_raw <- sample(nrow(sm1), size = smp_size_raw)
# NOTE(review): the indices are sized from sm1 but used to subset sm (which
# still contains the character ID column) — confirm sm and sm1 have the same
# row order, and consider subsetting sm1 for consistency.
train_raw.df <- sm[train_ind_raw, ]
test_raw.df <- sm[-train_ind_raw, ]

1.Model Development

# Linear discriminant analysis of sleep trouble on the 8 app-usage predictors.
# Use the bare response name so it is resolved through `data = train_raw.df`
# (mixing `df$y` with `data = df` is fragile for later predict() calls).
lda_model <- lda(Trouble_falling_asleep ~ Instagram + LinkedIn + SnapChat +
                   Twitter + Whatsapp + youtube + OTT + Reddit,
                 data = train_raw.df)

# Print priors, group means and LD1 coefficients.
lda_model
## Call:
## lda(train_raw.df$Trouble_falling_asleep ~ Instagram + LinkedIn + 
##     SnapChat + Twitter + Whatsapp + youtube + OTT + Reddit, data = train_raw.df)
## 
## Prior probabilities of groups:
##         0         1 
## 0.5333333 0.4666667 
## 
## Group means:
##   Instagram LinkedIn SnapChat   Twitter Whatsapp  youtube      OTT    Reddit
## 0   4.15000 4.520833 1.181250 0.9791667 5.887500 3.097917 3.547917 1.1875000
## 1   8.37619 3.198095 4.242857 0.4404762 6.079048 3.065238 2.358571 0.2157143
## 
## Coefficients of linear discriminants:
##                    LD1
## Instagram  0.344391244
## LinkedIn  -0.185506018
## SnapChat   0.166981292
## Twitter    0.480715943
## Whatsapp  -0.002749699
## youtube    0.228992485
## OTT       -0.247118598
## Reddit    -0.126181364

#The prior probabilities show the class distribution in the training data: the data is classified into 2 groups based on reported sleep trouble, with group 0 (no trouble falling asleep, prior 0.53) slightly larger than group 1 (trouble falling asleep, prior 0.47). #Coefficients of Linear Discriminants: the coefficients of LD1 represent the weights assigned to each predictor variable in the discriminant function.

# summary() on an lda object only lists component lengths/modes (see the
# echoed output below); it does not report fit statistics.
summary(lda_model)
##         Length Class  Mode     
## prior    2     -none- numeric  
## counts   2     -none- numeric  
## means   16     -none- numeric  
## scaling  8     -none- numeric  
## lev      2     -none- character
## svd      1     -none- numeric  
## N        1     -none- numeric  
## call     3     -none- call     
## terms    3     terms  call     
## xlevels  0     -none- list

##No obvious issues with class imbalance, the LDA model is likely acceptable.

# Histograms of the discriminant scores (LD1) for each group.
plot(lda_model)

###Residual Analysis

# lda objects have no residuals() method, so this returns NULL (echoed below).
residuals(lda_model)
## NULL

#LDA does not inherently produce residuals

###Prediction

# Predict on the held-out 25%: $class = predicted labels,
# $posterior = per-class probabilities, $x = discriminant scores (LD1).
prediction <- predict(lda_model, test_raw.df)
prediction
## $class
## [1] 0 0 1 1 1 0
## Levels: 0 1
## 
## $posterior
##           0           1
## 1 0.9349435 0.065056536
## 2 0.7132359 0.286764137
## 3 0.3242192 0.675780806
## 4 0.0251472 0.974852804
## 5 0.3744611 0.625538931
## 6 0.9940401 0.005959924
## 
## $x
##          LD1
## 1 -0.9932902
## 2 -0.2505513
## 3  0.4462550
## 4  1.6839962
## 5  0.3525388
## 6 -2.0313388

###Accuracy

# Predict on the test set
# NOTE(review): this re-runs predict(); `prediction$class` from the chunk
# above could be reused instead.
predicted_classes <- predict(lda_model, test_raw.df)$class

# Create a confusion matrix to understand misclassifications
# (in the echoed run below, the test split happened to contain only
# observed class 0, so the table shows a single reference column)
confusion_matrix <- table(predicted_classes, test_raw.df$Trouble_falling_asleep)
confusion_matrix 
##                  
## predicted_classes 0
##                 0 3
##                 1 3
# Proportion of correctly classified test observations.
accuracy <- sum(predicted_classes == test_raw.df$Trouble_falling_asleep) / nrow(test_raw.df)
accuracy 
## [1] 0.5

#An accuracy of 0.5 on this small test set is no better than random guessing, so the LDA model's out-of-sample performance is weak.

# Inspect the structure of the training split (15 of 21 rows, 13 columns).
str(train_raw.df)
## tibble [15 × 13] (S3: tbl_df/tbl/data.frame)
##  $ character             : chr [1:15] "19!@s" "peace" "vp1234" "ds2134" ...
##  $ Instagram             : num [1:15] 7 7.733 7 0.167 6.8 ...
##  $ LinkedIn              : num [1:15] 4 5.2 5 0 1.92 ...
##  $ SnapChat              : num [1:15] 3 3.683 0.417 0 1.867 ...
##  $ Twitter               : num [1:15] 0 0 0 0 0 ...
##  $ Whatsapp              : num [1:15] 10 4.18 5 1 6.95 ...
##  $ youtube               : num [1:15] 2 4.25 5 3 0.8 2.5 0.54 1.85 3.5 7 ...
##  $ OTT                   : num [1:15] 3 0 1 0 2.47 ...
##  $ Reddit                : num [1:15] 0 0 0.5 0 0 2.5 0.01 0 1 0 ...
##  $ Trouble_falling_asleep: num [1:15] 1 1 1 0 0 0 1 0 1 0 ...
##  $ productivity          : num [1:15] 1 1 1 0 1 1 1 1 1 1 ...
##  $ Tired_morning         : num [1:15] 1 0 1 0 1 0 1 0 1 0 ...
##  $ weekenergy            : num [1:15] 3 3 5 2 3 3 4 4 3 5 ...
library(klaR)
# Coerce the response to a factor so partimat() treats it as a class label.
train_raw.df$Trouble_falling_asleep <- factor(train_raw.df$Trouble_falling_asleep)
# Pairwise LDA partition plots of the decision regions.
# (Removed `attach(train_raw.df)`: partimat() is given `data=` explicitly,
# so attaching the data frame was unnecessary and pollutes the search path.)
partimat( Trouble_falling_asleep ~Instagram+LinkedIn+SnapChat+Twitter+youtube+OTT+Reddit, data=train_raw.df, method="lda")

Learnings and Takeaway

I can conclude that by using MVA models I am able to compute the effect of each independent variable on the dependent variables. This really helped me to understand the impact of various social media apps' usage on the healthy or unhealthy lifestyle of the students, which is the main goal of conducting this analysis. From exploratory data analysis it is clearly evident that Instagram and Whatsapp are the two predominant apps among students, and the hours spent on them have a direct relationship with sleep deprivation, diminished productivity, poor energy and tiredness. The exploratory data analysis also showed that many students are using social media apps for a good reason. Either way, having control over the usage of social media apps is beneficial for both the physical and mental health of the students.